# Customer Behavioural Analytics in the Retail Sector
library(tidyverse)
library(dplyr)
library(ggplot2)
library(factoextra)
library("RColorBrewer")
library(cluster)
library("metricsgraphics")
# Read the cleaned supermarket data set.
supermarket_data_clean <- read.csv("Supermarket_DataCleaned.csv")
# The slices below pick columns by position (up to column 33), so fail early
# with a clear message if the file does not have the expected width.
if (ncol(supermarket_data_clean) < 33) {
  stop("Supermarket_DataCleaned.csv must have at least 33 columns",
       call. = FALSE)
}
# Prepare data frames for clustering.
# Columns: customer_id, amount_purchased_shop_1, 2, 3, 4, 5
cluster.slice.temp <- supermarket_data_clean[, c(1, 29, 30, 31, 32, 33)]
cluster.slice.data <- supermarket_data_clean[, c(29, 30, 31, 32, 33)]
# Standardise the purchase columns; the scaled matrix is what the elbow plot
# uses to determine the ideal number of clusters.
cluster.slice.scale <- scale(cluster.slice.data)
# Draw a 50% random subsample of the (unscaled) purchase columns.
# The RNG is seeded first so the same rows are drawn on every run; without it
# every downstream result changes from run to run.
# NOTE(review): the subsample is taken from the unscaled data while the elbow
# plot uses the scaled data -- confirm this mismatch is intended.
set.seed(123)
cluster.slice.data.bot <- sample_frac(cluster.slice.data, 0.5)
# Draw an elbow plot: total within-group sum of squares against the number of
# k-means clusters, for 1 up to `nc` clusters.
#
# Args:
#   data: numeric matrix or data frame with one observation per row.
#   nc:   largest number of clusters to evaluate (single number >= 1).
#   seed: RNG seed that is set before every kmeans() call.
#
# Returns:
#   Invisibly, the numeric vector of within-group sums of squares, where
#   element k belongs to the k-cluster solution. Called for its plot.
wssplot <- function(data, nc = 15, seed = 1234) {
  stopifnot(is.numeric(nc), length(nc) == 1, nc >= 1)
  wss <- numeric(nc)
  # One cluster: the within-group sum of squares is the total sum of squares,
  # i.e. (n - 1) times the sum of the column variances.
  wss[1] <- (nrow(data) - 1) * sum(apply(data, 2, var))
  # seq_len(nc)[-1] is 2..nc and is empty when nc == 1, unlike 2:nc which
  # would count down (2, 1) and leave wss longer than the x axis.
  for (i in seq_len(nc)[-1]) {
    set.seed(seed)
    wss[i] <- sum(kmeans(data, centers = i)$withinss)
  }
  plot(seq_len(nc), wss, type = "b", xlab = "Number of Clusters",
       ylab = "Within groups sum of squares")
  invisible(wss)
}
# Elbow plot computed on cluster.slice.scale.
wssplot(cluster.slice.scale)

# Reading the elbow plot: the within-group sum of squares drops sharply from
# 1 to 4 clusters and only slightly from 4 to 5, which suggests a 4- or
# 5-cluster solution.
# K-means clustering: fit both candidates on cluster.slice.data.bot. The seed
# is reset right before each fit so the 25 random starts are fixed.
kclust4 <- {
  set.seed(123)
  kmeans(cluster.slice.data.bot, centers = 4, nstart = 25)
}
kclust5 <- {
  set.seed(123)
  kmeans(cluster.slice.data.bot, centers = 5, nstart = 25)
}
# PCA to visualize and verify the appropriate number of clusters.
# Both scatter plots use the same principal-component scores, so the PCA is
# computed once and reused instead of being run twice on identical input.
# NOTE(review): prcomp() is called with center = FALSE and scale. = FALSE, so
# the components are taken on the raw, uncentred values -- confirm intended.
cluster.pc.scores <- prcomp(cluster.slice.data.bot,
                            center = FALSE, scale. = FALSE)$x %>%
  as.data.frame()

# Scores coloured by the 4-cluster k-means assignment.
cluster.pc <- cluster.pc.scores
cluster.pc$kmeans.cluster <- kclust4$cluster
mjs_plot(cluster.pc, x = PC1, y = PC2) %>%
  mjs_point(color_accessor = kmeans.cluster) %>%
  mjs_labs(x = "principal comp 1", y = "principal comp 2")

# Same scores coloured by the 5-cluster k-means assignment.
cluster.pc1 <- cluster.pc.scores
cluster.pc1$kmeans.cluster <- kclust5$cluster
mjs_plot(cluster.pc1, x = PC1, y = PC2) %>%
  mjs_point(color_accessor = kmeans.cluster) %>%
  mjs_labs(x = "principal comp 1", y = "principal comp 2")
# Comparing the two PCA plots favours the 5-cluster k-means solution, so the
# final cluster visualization is drawn for kclust5.
# Points only, no standardisation of the data, normal-type ellipses.
fviz_cluster(
  object = kclust5,
  data = cluster.slice.data.bot,
  geom = "point",
  stand = FALSE,
  ellipse.type = "norm"
) +
  ggtitle(label = "Customer Clusters")
